LSTM-AD for Anomaly Detection in Time Series Data¶

In [ ]:
import os
import time

# TF_ENABLE_ONEDNN_OPTS=0 turns off TensorFlow's oneDNN custom operations.
# TensorFlow picks this flag up while it is being loaded, so it must already be
# in the environment when the `import tensorflow` line below executes; setting
# it in a later cell is too late for this kernel session.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

# Project-local helper modules. The wildcard imports are kept unchanged because
# the exact set of names the notebook relies on is defined in those modules.
from dataset import *
from plots import *
from metrics import *
from models_funtions import *

# Set style for matplotlib
plt.style.use("Solarize_Light2")

# Default plotly renderer used for every plotly figure shown in this notebook
import plotly.io as pio
pio.renderers.default = "notebook_connected"
In [ ]:
# Root directories of the two datasets: recordings without collisions ("normal")
# and recordings with collisions (treated as the anomalous data).
ROOTDIR_DATASET_NORMAL =  '../dataset/normal'
ROOTDIR_DATASET_ANOMALY = '../dataset/collisions'

# TF_ENABLE_ONEDNN_OPTS=0 means that the model will not use the oneDNN library for optimization
# NOTE(review): the flag only matters if it is in the environment before
# `import tensorflow` runs, and this cell executes after the import cell -
# confirm that the ordering is what was intended.

import os
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

Various parameters¶

In [ ]:
# Frequency setting, kept as a string: it is passed to get_dataframes and
# get_statistics and is also embedded in the features folder names below.
# Alternative values that were tried: '1.0', '0.1', '0.005'
freq = '0.01'

# File-name fragments handed to get_dataframes for each dataset
file_name_normal = "_20220811_rbtc_"
file_name_collisions = "_collision_20220811_rbtc_"

# Recording ids that belong to each dataset
recording_normal = [0, 2, 3, 4]
recording_collisions = [1, 5]

# Features folders, one per dataset and frequency; the '.' in freq is replaced
# by '_' so the value can be used in a path (e.g. '0.01' -> '0_01').
# Presumably these hold features cached on disk - verify against get_dataframes.
freq_str = freq.replace(".", "_")
features_folder_normal = f"./features/normal{freq_str}/"
features_folder_collisions = f"./features/collisions{freq_str}/"

Data¶

In [ ]:
# Normal recordings: features and raw data (the third return value is not used).
df_features_normal, df_normal_raw, _ = get_dataframes(
    ROOTDIR_DATASET_NORMAL,
    file_name_normal,
    recording_normal,
    freq,
    features_folder_normal,
)

# Collision recordings 1 and 5 together.
df_features_collisions, df_collisions_raw, df_collisions_raw_action = get_dataframes(
    ROOTDIR_DATASET_ANOMALY,
    file_name_collisions,
    recording_collisions,
    freq,
    features_folder_collisions + "1_5/",
)

# Collision recording 1 on its own.
df_features_collisions_1, df_collisions_raw_1, df_collisions_raw_action_1 = get_dataframes(
    ROOTDIR_DATASET_ANOMALY,
    file_name_collisions,
    [1],
    freq,
    features_folder_collisions + "1/",
)

# Collision recording 5 on its own.
df_features_collisions_5, df_collisions_raw_5, df_collisions_raw_action_5 = get_dataframes(
    ROOTDIR_DATASET_ANOMALY,
    file_name_collisions,
    [5],
    freq,
    features_folder_collisions + "5/",
)
Loading data.
Found 31 different actions.
Loading data done.

Loading features from file.
--- 0.05327296257019043 seconds ---
Loading data.
Found 31 different actions.
Loading data done.

Loading features from file.
--- 0.02664017677307129 seconds ---
Loading data.
Found 31 different actions.
Loading data done.

Loading features from file.
--- 0.0205533504486084 seconds ---
Loading data.
Found 31 different actions.
Loading data done.

Loading features from file.
--- 0.018580913543701172 seconds ---
In [ ]:
# Build the train/test sets three times, always from the same normal features
# but paired with a different collision set: both recordings, recording 1 only,
# recording 5 only. Only X_train (first call) is used for training below.
X_train, y_train, X_test, y_test, df_test = get_train_test_data(
    df_features_normal,
    df_features_collisions,
    full_normal=True,
)
X_train_1, y_train_1, X_test_1, y_test_1, df_test_1 = get_train_test_data(
    df_features_normal,
    df_features_collisions_1,
    full_normal=True,
)
X_train_5, y_train_5, X_test_5, y_test_5, df_test_5 = get_train_test_data(
    df_features_normal,
    df_features_collisions_5,
    full_normal=True,
)
c:\Users\VG User\Documents\GitHub\MLinAPP-FP01-14\.venv\Lib\site-packages\sklearn\base.py:493: UserWarning:

X does not have valid feature names, but VarianceThreshold was fitted with feature names

c:\Users\VG User\Documents\GitHub\MLinAPP-FP01-14\.venv\Lib\site-packages\sklearn\base.py:493: UserWarning:

X does not have valid feature names, but VarianceThreshold was fitted with feature names

c:\Users\VG User\Documents\GitHub\MLinAPP-FP01-14\.venv\Lib\site-packages\sklearn\base.py:493: UserWarning:

X does not have valid feature names, but VarianceThreshold was fitted with feature names

Collisions¶

In [ ]:
# Load the collision data of recordings '1' and '5'; get_collisions returns two
# objects per recording (named *_rec and *_init here).
collisions_rec1, collisions_init1 = get_collisions('1', ROOTDIR_DATASET_ANOMALY)
collisions_rec5, collisions_init5 = get_collisions('5', ROOTDIR_DATASET_ANOMALY)

# Merge the collisions of the two recordings in one dataframe.
# pd.concat is called without ignore_index, so each row keeps its original label.
collisions_rec = pd.concat([collisions_rec1, collisions_rec5])
collisions_init = pd.concat([collisions_init1, collisions_init5])
In [ ]:
# Collision zones and the matching labels, for both recordings together and for
# each recording separately.
collisions_zones, y_collisions = get_collisions_zones_and_labels(
    collisions_rec,
    collisions_init,
    df_features_collisions,
)
collisions_zones_1, y_collisions_1 = get_collisions_zones_and_labels(
    collisions_rec1,
    collisions_init1,
    df_features_collisions_1,
)
collisions_zones_5, y_collisions_5 = get_collisions_zones_and_labels(
    collisions_rec5,
    collisions_init5,
    df_features_collisions_5,
)

LSTM-AD for Anomaly Detection in Time Series Data¶

In [ ]:
from algorithms.lstm_ad import LSTMAD

def prepare_data_for_lstm(data, len_in):
    """
    Prepare data for LSTM-AD by concatenating every len_in rows.

    Consecutive, non-overlapping groups of ``len_in`` rows are flattened
    row-major into one output row, so each output row has
    ``len_in * data.shape[1]`` columns. Trailing rows that do not fill a
    complete group are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table, one row per time step.
    len_in : int
        Number of consecutive rows merged into one output row (>= 1).

    Returns
    -------
    pandas.DataFrame
        One row per complete group, indexed by the label of the last input
        row of that group. Empty (0 rows, ``len_in * n_features`` columns)
        when ``data`` has fewer than ``len_in`` rows.

    Raises
    ------
    ValueError
        If ``len_in`` is smaller than 1.
    """
    if len_in < 1:
        raise ValueError(f"len_in must be a positive integer, got {len_in!r}")

    n_features = data.shape[1]
    n_samples = data.shape[0] // len_in
    used_rows = n_samples * len_in

    # An explicit column count (instead of -1) keeps the reshape valid even
    # when there are zero complete groups: numpy cannot infer -1 for an empty array.
    prepared_data = data.iloc[:used_rows].to_numpy().reshape(n_samples, len_in * n_features)

    # Label every group with the index of its last row.
    group_index = data.index[len_in - 1:used_rows:len_in]
    return pd.DataFrame(prepared_data, index=group_index)

# FIXME: the pipeline currently breaks for values of len_in and len_out other than 1.
# NOTE(review): presumably because the test data passed to get_statistics is not
# windowed with prepare_data_for_lstm the way the training data is - confirm.
len_in = 1
X_train_lstm = prepare_data_for_lstm(X_train, len_in)
print(X_train_lstm.shape)

classifier = LSTMAD(
    len_in=len_in,         # Input sequence length
    len_out=1,             # Output sequence length (prediction horizon)
    num_epochs=100,         # Number of training epochs
    lr=1e-2,               # Learning rate
    batch_size=1,          # Batch size (usually 1 for time series)
    seed=42,               # Random seed for reproducibility
    gpu=None,              # Set to None for CPU, or specify GPU index if available
    details=True           # Set to True to get detailed predictions
)

# Train the LSTM on normal data only (X_train comes from the normal recordings)
classifier.fit(X_train_lstm)
print("LSTM-AD training completed.")
(973, 118)
100%|██████████| 100/100 [00:54<00:00,  1.82it/s]
LSTM-AD training completed.

Predictions¶

In [ ]:
# Score each test set with the trained classifier, using the "mad" threshold type:
# both recordings together, then recording 1 and recording 5 separately.
df_test = get_statistics(
    X_test,
    y_collisions,
    classifier,
    df_test,
    freq,
    threshold_type="mad",
)
df_test_1 = get_statistics(
    X_test_1,
    y_collisions_1,
    classifier,
    df_test_1,
    freq,
    threshold_type="mad",
)
df_test_5 = get_statistics(
    X_test_5,
    y_collisions_5,
    classifier,
    df_test_5,
    freq,
    threshold_type="mad",
)
Anomaly prediction completed.
Number of anomalies detected: 3 with threshold 320189.10040908743, std
Number of anomalies detected: 99 with threshold 4352.728189752417, mad
Number of anomalies detected: 16 with threshold 19752.09464404346, percentile
Number of anomalies detected: 11 with threshold 22140.065470155492, IQR
Number of anomalies detected: 306 with threshold 0.0, zero

choosen threshold type: mad, with value: 4352.7282
F1 Score: 0.8431
Accuracy: 0.8954
Precision: 0.8687
Recall: 0.8190
              precision    recall  f1-score   support

           0       0.91      0.94      0.92       201
           1       0.87      0.82      0.84       105

    accuracy                           0.90       306
   macro avg       0.89      0.88      0.88       306
weighted avg       0.89      0.90      0.89       306

ROC AUC Score: 0.9391
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Anomalies detected: 99
Best threshold: 2000.9874 | F1 Score: 0.8465 | Precision: 0.7500 | Recall: 0.9714
Anomalies detected with best threshold: 136

	-------------------------------------------------------------------------------------

Anomaly prediction completed.
Number of anomalies detected: 1 with threshold 246703.38367839003, std
Number of anomalies detected: 49 with threshold 2064.64320739006, mad
Number of anomalies detected: 9 with threshold 13631.286312833798, percentile
Number of anomalies detected: 21 with threshold 5775.268781973501, IQR
Number of anomalies detected: 164 with threshold 0.0, zero

choosen threshold type: mad, with value: 2064.6432
F1 Score: 0.7381
Accuracy: 0.8659
Precision: 0.6327
Recall: 0.8857
              precision    recall  f1-score   support

           0       0.97      0.86      0.91       129
           1       0.63      0.89      0.74        35

    accuracy                           0.87       164
   macro avg       0.80      0.87      0.82       164
weighted avg       0.89      0.87      0.87       164

ROC AUC Score: 0.9243
c:\Users\VG User\Documents\GitHub\MLinAPP-FP01-14\src\models_funtions.py:67: RuntimeWarning:

invalid value encountered in divide

No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Anomalies detected: 49
Best threshold: 1963.6958 | F1 Score: 0.7674 | Precision: 0.6471 | Recall: 0.9429
Anomalies detected with best threshold: 51

	-------------------------------------------------------------------------------------

Anomaly prediction completed.
Number of anomalies detected: 2 with threshold 391549.0211743849, std
Number of anomalies detected: 11 with threshold 18920.457281060368, mad
Number of anomalies detected: 8 with threshold 21480.948372360093, percentile
Number of anomalies detected: 2 with threshold 31557.158635700933, IQR
Number of anomalies detected: 141 with threshold 0.0, zero

choosen threshold type: mad, with value: 18920.4573
F1 Score: 0.2090
Accuracy: 0.6241
Precision: 0.6364
Recall: 0.1250
              precision    recall  f1-score   support

           0       0.62      0.95      0.75        85
           1       0.64      0.12      0.21        56

    accuracy                           0.62       141
   macro avg       0.63      0.54      0.48       141
weighted avg       0.63      0.62      0.54       141

ROC AUC Score: 0.8895
c:\Users\VG User\Documents\GitHub\MLinAPP-FP01-14\src\models_funtions.py:67: RuntimeWarning:

invalid value encountered in divide

No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Anomalies detected: 11
Best threshold: 6321.9896 | F1 Score: 0.8594 | Precision: 0.7639 | Recall: 0.9821
Anomalies detected with best threshold: 72

	-------------------------------------------------------------------------------------

c:\Users\VG User\Documents\GitHub\MLinAPP-FP01-14\src\models_funtions.py:67: RuntimeWarning:

invalid value encountered in divide

In [ ]:
# True collision zones against the predicted ones, recordings 1 and 5 together.
plot_anomalies_true_and_predicted(
    df_collisions_raw,
    df_collisions_raw_action,
    collisions_zones,
    df_test,
    title="Collisions zones vs predicted zones for both recordings",
)
In [ ]:
# True collision zones against the predicted ones, recording 1 only.
plot_anomalies_true_and_predicted(
    df_collisions_raw_1,
    df_collisions_raw_action_1,
    collisions_zones_1,
    df_test_1,
    title="Collisions zones vs predicted zones for recording 1",
)
In [ ]:
# True collision zones against the predicted ones, recording 5 only.
plot_anomalies_true_and_predicted(
    df_collisions_raw_5,
    df_collisions_raw_action_5,
    collisions_zones_5,
    df_test_5,
    title="Collisions zones vs predicted zones for recording 5",
)